1.1 Filter the data to include only rows where Year is 1962 and then make a scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap for the filtered data.
# Question 1
# 1.1
# Keep only the 1962 observations.
year_1962 <- my_data %>%
  filter(Year == 1962)
# Discard rows with a missing value in either plotted column
# (`|` inside drop_na() selects the union of the two columns).
clean_1962 <- year_1962 %>%
  drop_na(`CO2 emissions (metric tons per capita)` | gdpPercap)
# Scatter plot: CO2 emissions per capita on x, GDP per capita on y.
cor_plot <- ggplot(
  data = clean_1962,
  mapping = aes(
    x = `CO2 emissions (metric tons per capita)`,
    y = gdpPercap
  )
) +
  geom_point() +
  ggtitle("GDP vs CO2 emissions in 1962")
plot(cor_plot)
1.2 Calculate the correlation of ‘CO2 emissions (metric tons per capita)’ and gdpPercap. What is the correlation and associated p value?
# 1.2
# Run the correlation test on the cleaned 1962 data and keep its p-value.
Corr_info <- cor.test(clean_1962$`CO2 emissions (metric tons per capita)`, clean_1962$gdpPercap)
p_value <- Corr_info[["p.value"]]
# Correlation coefficient for the same pair of columns.
corr_gdp_co2 <- with(
  clean_1962,
  cor(`CO2 emissions (metric tons per capita)`, gdpPercap)
)
# Report both numbers in a single line.
print(
  paste(
    "Correlation between CO2 emission & gdpPercap in 1962 is",
    corr_gdp_co2,
    "p-value:",
    p_value
  )
)
## [1] "Correlation between CO2 emission & gdpPercap in 1962 is 0.926081672501947 p-value: 1.12867922100394e-46"
1.3 In what year is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?
# 1.3
# Find the year in which CO2 emissions per capita and GDP per capita are most
# strongly correlated.
# Keep only the three columns needed and drop rows missing any of them.
Strong_year_df <- data.frame(
  years = my_data$Year,
  co2_emission = my_data$`CO2 emissions (metric tons per capita)`,
  gdp = my_data$gdpPercap
)
cleaned_strong_year <- drop_na(Strong_year_df)
# One correlation coefficient per year.
correlation_results <- cleaned_strong_year %>%
  group_by(years) %>%
  summarize(correlation = cor(co2_emission, gdp, use = "complete.obs"))
# Pick the strongest year(s):
#  - years whose coefficient is NA are removed first; otherwise max() would be
#    NA and the comparison would select no rows at all;
#  - strength is judged on the absolute value, so a strong negative
#    correlation is not overlooked.
strongest_correlation_year <- correlation_results %>%
  filter(!is.na(correlation)) %>%
  filter(abs(correlation) == max(abs(correlation))) %>%
  select(years, correlation)
# Extract the year column as a plain vector (ties are listed comma-separated)
# instead of pasting a one-column tibble.
print(paste(
  "The year with strongest correlation between CO2 emission & gdpPercap is",
  paste(strongest_correlation_year$years, collapse = ", ")
))
## [1] "The year with strongest correlation between CO2 emission & gdpPercap is 1967"
1.4 Create an interactive scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent.
# 1.4
# Interactive scatter plot for 1967: CO2 emissions per capita (x) against GDP
# per capita (y), coloured by continent and sized by population.
year_1967 <- my_data %>%
  filter(Year == 1967)
# Remove rows with a missing value in either axis variable.
clean_1967 <- year_1967 %>%
  drop_na(`CO2 emissions (metric tons per capita)` | gdpPercap)
cor_plot_1967 <- ggplot(
  data = clean_1967,
  mapping = aes(
    x = `CO2 emissions (metric tons per capita)`,
    y = gdpPercap,
    color = continent,
    size = pop
  )
) +
  geom_point() +
  ggtitle("GDP vs CO2 emissions in 1967")
# Convert the static ggplot into a plotly widget.
interactive_plot <- ggplotly(cor_plot_1967)
interactive_plot
What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’?
# Relationship between continent and energy use per capita:
# one-way ANOVA, Tukey HSD pairwise comparisons, and a box plot.
data_continent_energy <- data.frame(
  Continent = my_data$continent,
  Energy = my_data$`Energy use (kg of oil equivalent per capita)`
)
# Drop rows where either the continent or the energy value is missing.
clean_continent_energy <- na.omit(data_continent_energy)
# Perform anova test
anova_model <- aov(Energy ~ Continent, data = clean_continent_energy)
anova_summary <- summary(anova_model)
# Extract the F-value and p-value
# Continent is the only model term, so it is the first row of the ANOVA table.
# Indexing by position avoids depending on the exact row-label text, which
# summary.aov() may pad with trailing spaces.
# Note: p_value is reused here and replaces the value stored for question 1.2.
f_value <- anova_summary[[1]][1, "F value"]
p_value <- anova_summary[[1]][1, "Pr(>F)"]
# Perform Tukey's Honest Significant Difference test
tukey_result <- TukeyHSD(anova_model)
# Create a box plot for Continent and Energy usage
Q2_plot <- ggplot(clean_continent_energy, aes(x = Continent, y = Energy, fill = Continent)) +
  geom_boxplot() +
  ggtitle("Energy Usage by Continent") +
  xlab("Continent") +
  ylab("Energy Used")
interactive_Q2_plot <- ggplotly(Q2_plot)
# Display the widget, then report the test statistics. These were fused into
# one token in the original, which made R look for a function named
# interactive_Q2_plotcat.
interactive_Q2_plot
cat("F-value:", f_value, "\n", "p-value:", p_value, "\n")
## F-value: 51.45916
## p-value: 8.527003e-39
print(tukey_result)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = Energy ~ Continent, data = clean_continent_energy)
##
## $Continent
## diff lwr upr p adj
## Americas-Africa 1005.1037 466.8326 1543.3748 0.0000041
## Asia-Africa 1168.7636 628.2529 1709.2742 0.0000000
## Europe-Africa 2447.5453 1947.3838 2947.7067 0.0000000
## Oceania-Africa 3281.7976 2040.3410 4523.2543 0.0000000
## Asia-Americas 163.6599 -384.4160 711.7357 0.9256447
## Europe-Americas 1442.4416 934.1141 1950.7691 0.0000000
## Oceania-Americas 2276.6940 1031.9249 3521.4630 0.0000069
## Europe-Asia 1278.7817 768.0833 1789.4801 0.0000000
## Oceania-Asia 2113.0341 867.2950 3358.7732 0.0000402
## Oceania-Europe 834.2524 -394.5176 2063.0223 0.3421942
Relationship between continents and their energy usage: Mean energy use in Africa is significantly lower than in every other continent. Europe has significantly higher mean energy use than Africa, the Americas, and Asia. Oceania has significantly higher mean energy use than Africa, the Americas, and Asia. No significant difference was found between Asia and the Americas, or between Oceania and Europe.
An ANOVA (Analysis of Variance) test is performed to determine if there are statistically significant differences in the mean values of a numerical variable (in this case, “energy used”) across different categories of a categorical variable (in this case, “continents”), because ANOVA is specifically designed to compare the means of a numerical variable across multiple groups or categories.
Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990?
# Compare imports of goods and services (% of GDP) between Europe and Asia
# for years after 1990, using a two-sample t-test and a box plot.
# Keep only European and Asian rows from years strictly after 1990.
EUR_ASIA_1990 <- filter(my_data, continent %in% c("Europe", "Asia"), Year > 1990)
# Drop rows with no import figure.
Cleaned_Eur_asia_by_import <- drop_na(EUR_ASIA_1990, `Imports of goods and services (% of GDP)`)
df_eur_asia <- data.frame(
  Continent = Cleaned_Eur_asia_by_import$continent,
  Import_percentage = Cleaned_Eur_asia_by_import$`Imports of goods and services (% of GDP)`
)
# Perform the t-test
t_test_result <- t.test(Import_percentage ~ Continent, data = df_eur_asia)
# Create a box plot for Continent and import %
Q3_plot <- ggplot(df_eur_asia, aes(x = Continent, y = Import_percentage, fill = Continent)) +
  geom_boxplot() +
  ggtitle("Box Plot of Import Percentage by Continent") +
  xlab("Continent") +
  ylab("Import Percentage")
interactive_Q3_plot <- ggplotly(Q3_plot)
# Display the widget, then the test result. These were fused into one token in
# the original, which made R look for a function named interactive_Q3_plotprint.
interactive_Q3_plot
print(t_test_result)
##
## Welch Two Sample t-test
##
## data: Import_percentage by Continent
## t = 1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means between group Asia and group Europe is not equal to 0
## 95 percent confidence interval:
## -2.321099 12.433240
## sample estimates:
## mean in group Asia mean in group Europe
## 46.84531 41.78924
Since the p-value is 0.1776 (which is greater than 0.05), there is no statistically significant difference between the mean import percentages of Asia and Europe in the years after 1990.
A two-sample t-test (Welch's version, as shown in the output above) was performed to determine whether Europe and Asia differ significantly in import percentage, because this test compares the means of two independent groups (here, Europe and Asia) to see whether they are statistically significantly different from each other.
What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years?
# Identify the country (or countries) with the highest population density,
# measured as each country's average density over all available years.
Population_country_data <- select(my_data, `Country Name`, `Population density (people per sq. km of land area)`)
# Drop rows with a missing country name or density value.
Clean_pop_con <- drop_na(Population_country_data)
Grouped_data <- group_by(Clean_pop_con, `Country Name`)
average_density <- summarise(Grouped_data, avg_density = mean(`Population density (people per sq. km of land area)`))
# Rows whose average equals the overall maximum (more than one row on a tie).
max_density_country <- filter(average_density, avg_density == max(avg_density))
# Plotting
sorted_data <- average_density %>% arrange(desc(avg_density))
top_countries <- head(sorted_data, n = 10)
Q4_plot <- ggplot(top_countries, aes(x = reorder(`Country Name`, avg_density), y = avg_density)) +
  geom_bar(stat = "identity", fill = "orange") +
  ggtitle("Top 10 Countries with the Max average pop density") +
  xlab("Country") +
  ylab("Average Population density") +
  coord_flip()
interactive_Q4_plot <- ggplotly(Q4_plot)
# Display the widget, then the answer. These were fused into one token in the
# original, which made R look for a function named interactive_Q4_plotpaste.
interactive_Q4_plot
# Use the country-name column as a plain character vector (ties are listed
# comma-separated) instead of pasting a one-column tibble.
paste(
  "The country with max average population density is ",
  paste(max_density_country$`Country Name`, collapse = ", ")
)
## [1] "The country with max average population density is Macao SAR, China"
What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ between 1962 and 2007?
# Find the country (or countries) with the greatest increase in life
# expectancy between 1962 and 2007.
# The increase is the 2007 value minus the 1962 value. The earlier max - min
# over the whole period overstated the change for any country whose life
# expectancy dipped below its 1962 level, or peaked before 2007, in between.
# Only the two endpoint years are needed.
Year_range_data <- filter(my_data, Year %in% c(1962, 2007))
Life_country_data <- select(Year_range_data, `Country Name`, Year, `Life expectancy at birth, total (years)`)
# Drop rows with a missing country name, year, or life-expectancy value.
Cleaned_Lif_con <- drop_na(Life_country_data)
Grouped_li_con <- group_by(Cleaned_Lif_con, `Country Name`)
# `[1]` yields NA when a country has no row for that year, so such countries
# get an NA increase rather than raising an error.
# NOTE(review): assumes at most one row per country and year - confirm.
Increased_life_expectancy <- summarise(
  Grouped_li_con,
  increase =
    `Life expectancy at birth, total (years)`[Year == 2007][1] -
    `Life expectancy at birth, total (years)`[Year == 1962][1]
)
# Countries lacking either endpoint cannot be ranked; removing them also keeps
# max() below from returning NA.
Increased_life_expectancy <- filter(Increased_life_expectancy, !is.na(increase))
greatest_increase <- filter(Increased_life_expectancy, increase == max(increase))
# Plotting
sorted_data <- Increased_life_expectancy %>% arrange(desc(increase))
top_countries <- head(sorted_data, n = 10)
Q5_plot <- ggplot(top_countries, aes(x = reorder(`Country Name`, increase), y = increase)) +
  geom_bar(stat = "identity", fill = "purple") +
  ggtitle("Top 10 Countries with the Greatest Increase in Life Expectancy") +
  xlab("Country") +
  ylab("Increase in Life Expectancy (years)") +
  coord_flip()
interactive_Q5_plot <- ggplotly(Q5_plot)
# Display the widget, then the answer. These were fused into one token in the
# original, which made R look for a function named interactive_Q5_plotpaste.
interactive_Q5_plot
# Use the country-name column as a plain character vector (ties are listed
# comma-separated) instead of pasting a one-column tibble.
paste(
  "The Country with greatest increase in Life expectancy is",
  paste(greatest_increase$`Country Name`, collapse = ", ")
)
# The output below was recorded from the earlier max - min calculation;
# re-run this chunk to refresh it.
## [1] "The Country with greatest increase in Life expectancy is Cambodia"